
import requests
from bs4 import BeautifulSoup
import re	



 #       headers = {'Referer':'https://accounts.pixiv.net/loginlang=zh&source=pc&view_type=page&ref=wwwtop_accounts_index',
 #          'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'
 #         }
 #       r = requests.get(url, timeout=30, headers=headers)
#get id and name 
#url = 'http://quote.eastmoney.com/stock_list.html'
#html=requests.get(url)
#soup=BeautifulSoup(html.content,'html.parser')
#stocks_in = soup.find_all(class_="odd")

#for stock_in in stocks_in:
#    print(stock_in.get_text())

import requests
from bs4 import BeautifulSoup
import re
 
# Optimization note: setting the response encoding explicitly reduces the time
# the program spends auto-detecting the page encoding.




def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or None on failure.

    The encoding is taken from ``apparent_encoding`` because the target site
    may mis-declare its charset; this makes Chinese pages decode correctly.
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # turn 4xx/5xx responses into exceptions
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt/SystemExit.
        print("获取网页内容失败！")
        return None  # make the failure value explicit for callers


def parsePage(html):
    """Parse a guba list page into [reads, comments, titles, authors, times].

    Each element of the returned value is one column (a list of strings)
    scraped from the ``span`` elements with classes ``l1 a1`` .. ``l5 a5``.
    On a parse failure the columns collected so far (possibly an empty list)
    are returned, preserving the original best-effort behaviour.
    """
    # Renamed from `list`/`time`, which shadowed the builtin and a stdlib module.
    data = []
    reads = []
    comments = []
    titles = []
    authors = []
    times = []
    try:
        soup = BeautifulSoup(html, "html.parser")
        for each in soup.find_all('span', 'l1 a1'):
            text = each.string
            # Read counts like "1.2万" should keep only the numeric part.
            # The original sliced off the last TWO characters ([:-2]), which
            # also dropped the final digit; strip just the '万' instead.
            # Guarding on `text` avoids a TypeError when .string is None,
            # which previously aborted the whole parse via the bare except.
            if text and '万' in text:
                text = text.replace('万', '')
            reads.append(text)
        data.append(reads[1:])  # reads[0] is the column header '阅读'
        for each in soup.find_all('span', 'l2 a2'):
            comments.append(each.string)
        data.append(comments[1:])  # comments[0] is the header '评论'
        for each in soup.find_all('span', 'l3 a3'):
            titles.append(each.string)
        data.append(titles)
        for each in soup.find_all('span', 'l4 a4'):
            # .text flattens nested tags; author names sit inside child elements,
            # so .string would be None here.  (Stray debug print removed.)
            authors.append(each.text)
        data.append(authors)
        for each in soup.find_all('span', 'l5 a5'):
            times.append(each.string)
        data.append(times[1:])  # times[0] is the header '最后更新'
    except Exception:  # narrowed from a bare except; still best-effort
        print("解析网页字段失败！")
    return data

 
# Fetch one stock's message-board listing page and dump the parsed columns.
page_html = getHTMLText("https://guba.eastmoney.com/list,601868.html")
print(parsePage(page_html))

